//===-- Strlen implementation for aarch64 ---------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H

#include "src/__support/macros/properties/cpu_features.h"

#if defined(__ARM_NEON)
#include "src/__support/CPP/bit.h" // countr_zero
#include <arm_neon.h>
#include <stddef.h> // size_t
namespace LIBC_NAMESPACE_DECL {
namespace neon {
[[maybe_unused]] LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE static size_t
string_length(const char *src) {
  using Vector __attribute__((may_alias)) = uint8x8_t;

  uintptr_t misalign_bytes = reinterpret_cast<uintptr_t>(src) % sizeof(Vector);
  const Vector *block_ptr =
      reinterpret_cast<const Vector *>(src - misalign_bytes);
  Vector v = *block_ptr;
  Vector vcmp = vceqz_u8(v);
  uint64x1_t cmp_mask = vreinterpret_u64_u8(vcmp);
  uint64_t cmp = vget_lane_u64(cmp_mask, 0);
  cmp = cmp >> (misalign_bytes << 3);
  if (cmp)
    return cpp::countr_zero(cmp) >> 3;

  while (true) {
    ++block_ptr;
    v = *block_ptr;
    vcmp = vceqz_u8(v);
    cmp_mask = vreinterpret_u64_u8(vcmp);
    cmp = vget_lane_u64(cmp_mask, 0);
    if (cmp)
      return static_cast<size_t>(reinterpret_cast<uintptr_t>(block_ptr) -
                                 reinterpret_cast<uintptr_t>(src) +
                                 (cpp::countr_zero(cmp) >> 3));
  }
}
} // namespace neon
} // namespace LIBC_NAMESPACE_DECL
#endif // __ARM_NEON

#ifdef LIBC_TARGET_CPU_HAS_SVE
#include "src/__support/macros/optimization.h"
#include <arm_sve.h>
namespace LIBC_NAMESPACE_DECL {
namespace sve {
[[maybe_unused]] LIBC_INLINE static size_t string_length(const char *src) {
  const uint8_t *ptr = reinterpret_cast<const uint8_t *>(src);
  // Initialize the first-fault register to all true
  svsetffr();
  const svbool_t all_true = svptrue_b8(); // all true predicate
  svbool_t cmp_zero;
  size_t len = 0;

  for (;;) {
    // Read a vector's worth of bytes, stopping on first fault.
    svuint8_t data = svldff1_u8(all_true, &ptr[len]);
    svbool_t fault_mask = svrdffr_z(all_true);
    bool has_no_fault = svptest_last(all_true, fault_mask);
    if (LIBC_LIKELY(has_no_fault)) {
      // First fault did not fail: the whole vector is valid.
      // Avoid depending on the contents of FFR beyond the branch.
      len += svcntb(); // speculative increment
      cmp_zero = svcmpeq_n_u8(all_true, data, 0);
      bool has_no_zero = !svptest_any(all_true, cmp_zero);
      if (LIBC_LIKELY(has_no_zero))
        continue;
      len -= svcntb(); // undo speculative increment
      break;
    } else {
      // First fault failed: only some of the vector is valid.
      // Perform the comparison only on the valid bytes.
      cmp_zero = svcmpeq_n_u8(fault_mask, data, 0);
      bool has_zero = svptest_any(fault_mask, cmp_zero);
      if (LIBC_LIKELY(has_zero))
        break;
      svsetffr();
      len += svcntp_b8(all_true, fault_mask);
      continue;
    }
  }
  // Select the bytes before the first and count them.
  svbool_t before_zero = svbrkb_z(all_true, cmp_zero);
  len += svcntp_b8(all_true, before_zero);
  return len;
}
} // namespace sve
} // namespace LIBC_NAMESPACE_DECL
#endif // LIBC_TARGET_CPU_HAS_SVE

namespace LIBC_NAMESPACE_DECL {
#ifdef LIBC_TARGET_CPU_HAS_SVE
namespace string_length_impl = sve;
#elif defined(__ARM_NEON)
namespace string_length_impl = neon;
#endif
} // namespace LIBC_NAMESPACE_DECL
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H
